condition by time point

only consider time points in KO village samples

libraries

library(Seurat)
library(Matrix)
library(dplyr)
library(tidyr)
library(ggplot2)
library(tibble)
library(cowplot)  
library(patchwork)
library(stringr)
library(pheatmap)
library(reshape2)
library(readxl)
library(stringr)
library(DESeq2)
library(gridExtra)
library(ggrepel)
library(ggpubr)
library(tidyverse)

theme_set(theme_classic())


data_dir = "/home/zli4/MSK_DEC24/Analysis"

out_dir = "/fh/fast/sun_w/zhaoheng_li/MSK_DEC24/Analysis/outs/cell_type_composition"

set.seed(022520)
lm_plot=function(mod){
  summary_model= summary(mod)
  coefficients= summary_model$coefficients
  coeff_df= as.data.frame(coefficients)
  coeff_df$Predictor= rownames(coeff_df)

  significant_predictors= coeff_df %>%
    filter(Predictor != "(Intercept)" & `Pr(>|t|)` < 0.05)
  

  conf_int= confint(mod)
  conf_df= as.data.frame(conf_int)
  conf_df$Predictor= rownames(conf_df)
  
  # Merge with the significant predictors
  plot_data= merge(significant_predictors, conf_df, by = "Predictor")
  names(plot_data)= c("Predictor", "Estimate", "Std.Error", "t.value", "p.value", "CI.Lower", "CI.Upper")
  
  plot_data$Predictor = gsub(paste(c("genotype"), collapse = "|"), "", plot_data$Predictor)
  fig = ggplot(plot_data, aes(x = reorder(Predictor, Estimate), y = Estimate)) +
    geom_point() +
    geom_errorbar(aes(ymin = CI.Lower, ymax = CI.Upper), width = 0.2) +
    coord_flip() +  # Flip the coordinates for better readability
    theme_minimal() +
    labs(
      title = "Estimates and Confidence Intervals for Significant Predictors",
      x = "",
      y = "Estimate"
    )
  return(fig)
}

lm_plot_exp=function(mod){
  summary_model= summary(mod)
  coefficients= summary_model$coefficients
  coeff_df= as.data.frame(coefficients)
  coeff_df$Predictor= rownames(coeff_df)

  significant_predictors= coeff_df %>%
    filter(Predictor != "(Intercept)" & `Pr(>|t|)` < 0.05)
  

  conf_int= confint(mod)
  conf_df= as.data.frame(conf_int)
  conf_df$Predictor= rownames(conf_df)
  
  # Merge with the significant predictors
  plot_data= merge(significant_predictors, conf_df, by = "Predictor")
  names(plot_data)= c("Predictor", "Estimate", "Std.Error", "t.value", "p.value", "CI.Lower", "CI.Upper")
  
  plot_data$Predictor = gsub(paste(c("genotype"), collapse = "|"), "", plot_data$Predictor)
  fig = ggplot(plot_data, aes(x = reorder(Predictor, Estimate), y = exp(Estimate))) +
    geom_point() +
    geom_errorbar(aes(ymin = exp(CI.Lower), ymax = exp(CI.Upper)), width = 0.2) +
    coord_flip() +  # Flip the coordinates for better readability
    theme_minimal() +
    labs(
      title = "Estimates and Confidence Intervals for Significant Predictors",
      x = "",
      y = "Estimate"
    )
  return(fig)
}

load data

integrated = readRDS(file.path(data_dir,"integrated-rpca_BDF-ref_withMeta.RDS"))
# remove cells without cell type annotations
keep.cells= rownames(integrated@meta.data)[
  !is.na(integrated@meta.data$celltype)
]

integrated= subset(
  integrated,
  cells = keep.cells
)
integrated$celltype = as.factor(integrated$celltype)
summary(integrated$celltype)
##                  DE              Ductal         Endothelial                 EnP 
##               16283               18880                1114               19845 
##                 ESC              ESC_DE               Liver Mesenchymal_Stromal 
##               15280                4559                9363                1698 
##                 PFG                 PGT                  PP            SC-alpha 
##                8440                2627               14249                9684 
##             SC-beta            SC-delta               SC-EC             Stromal 
##               10152                1856               13743                3944

order cell types

celltype_orders = c(
  "ESC",
  "ESC_DE",
  "DE",
  "PFG",
  "PGT",
  "PP",
  "EnP",
  "SC-EC",
  "SC-alpha",
  "SC-beta",
  "SC-delta",
  "Ductal",
  "Mesenchymal_Stromal",
  "Stromal",
  "Endothelial",
  "Liver"
)

integrated$celltype = factor(integrated$celltype,
                              levels =celltype_orders ,
                              ordered = TRUE)

tol_okabe16 <- c(
  "#4477AA", "#EE6677", "#228833", "#CCBB44",
  "#66CCEE", "#AA3377", "#BBBBBB", "#009E73",
  "#E69F00", "#56B4E9", "#0072B2", "#D55E00",
  "#CC79A7", "#117733", "#882255", "#DDCC77"
)

celltype_palette = setNames(
  tol_okabe16,
  celltype_orders
)
saveRDS(celltype_palette,"/fh/fast/sun_w/zhaoheng_li/MSK_DEC24/Analysis/outs/celltype_palette.RDS")
celltype_palette = readRDS("/fh/fast/sun_w/zhaoheng_li/MSK_DEC24/Analysis/outs/celltype_palette.RDS")

filter by time points

obj = integrated%>%subset(subset = time_point%in%c("D-1","D3","D7","D11","D18"))

obj$time_point = droplevels(obj$time_point)
summary(obj$time_point)
##   D-1    D3    D7   D11   D18 
## 15775 18602 16943 12052 36357
meta_df = obj[[]]

cell_counts = meta_df %>%
  group_by(time_point, celltype, feature.4) %>%
  summarise(
    n_cells = n(),
    .groups = "drop"
  )


head(cell_counts)
## # A tibble: 6 × 4
##   time_point celltype feature.4    n_cells
##   <fct>      <ord>    <chr>          <int>
## 1 D-1        ESC      102_MNX1         164
## 2 D-1        ESC      103_MNX1         218
## 3 D-1        ESC      107_TET1/2/3     195
## 4 D-1        ESC      108_MNX1         152
## 5 D-1        ESC      109_TET1/2/3      54
## 6 D-1        ESC      110_WT           183
write.table(
  cell_counts,
  file      = file.path(out_dir,"cell_counts.tsv"),
  sep       = "\t",
  quote     = FALSE,
  row.names = FALSE
)

sample by time point

table(obj$sample_name,obj$time_point)
##    
##       D-1    D3    D7   D11   D18
##   A   289   361   439   376  1560
##   B   250   372   418   404  1602
##   C   332   285   433   418  2344
##   D   426   271   481   492  2588
##   E   373   279   519   504  2060
##   F   466   305   542   520  2558
##   G 13639     0     0     0     0
##   H     0 16729     0     0     0
##   I     0     0 14111     0     0
##   J     0     0     0  9338     0
##   L     0     0     0     0 23645

genotype by time point

table(obj$genotype,obj$time_point)
##            
##               D-1    D3    D7   D11   D18
##   ARX         201   506   586   350   462
##   BCOR        123   157   170   118   611
##   BMPR1A      235   265    83    87   219
##   FOXA1       489  1291  1160   601  1349
##   FOXA2       709   964  2045   276   276
##   GATA4       473   495   106   104   255
##   GATA4het    342   476   237   222   460
##   GATA6       311   485   369   153    82
##   GATA6het    542   719   233   206   333
##   GLIS3       508   150    67    47    39
##   GSC         376   563   194   146   266
##   HHEX        549   300   150    26    20
##   HHEXe       389   443   231    67   127
##   HHEXhet     130   139    19     3     4
##   HNF4A       141   194   120   148   309
##   HNF4Ahet    121   142    95   172   265
##   KDM2B        86    57    11     5     5
##   MNX1        535  1166  1181   913  2035
##   NANOGehet   235   205    97    95   286
##   NEUROD1     307   479   384   321  1135
##   NGN3        224   288   232   171   431
##   NKX2-2      610   926  1048  1111   909
##   ONECUT1e    655   378   416   175   375
##   OTUD5       873   725   503   276   501
##   PAX6        348   409   499   553  1014
##   PBX1        253    78   112    98   328
##   PDX1        738   623   495   553  3319
##   PDX1het     320    29    14     5     7
##   PROSER1     173   485   327   177  1081
##   QSER1       248   377   147    71   282
##   QSER1TET1   263   480   121    38   123
##   RFX6        610   754   750   281   575
##   TADA2B      212   131   146   171   387
##   TET1        165   385   293   175   245
##   TET1/2/3    256   119    90    83   247
##   TLE3        227   357   386   347  1506
##   WT         2798  2862  3826  3707 16489

cell type by time point

table(obj$celltype,obj$time_point)
##                      
##                         D-1    D3    D7   D11   D18
##   ESC                 13613  1152   209   102    62
##   ESC_DE               2095  2427    13    20     0
##   DE                     17 14964   566    67   100
##   PFG                     1     5  7520   620    75
##   PGT                     0     0     2     0     1
##   PP                      4     0    46  7871   512
##   EnP                     6     5     8  1298  4328
##   SC-EC                   6     0     2    11  7093
##   SC-alpha                7     2     0    58  4722
##   SC-beta                 0     2     2     2  5402
##   SC-delta                0     1     0    25   749
##   Ductal                  3     1    37   193 11228
##   Mesenchymal_Stromal    21    38    20   258    76
##   Stromal                 1     5  1261   961  1332
##   Endothelial             0     0   815   254     7
##   Liver                   1     0  6442   312   670

cell-type proportions by time_point

meta = obj[[]]
# ignore time points unique to external WT samples
common_times = rownames(table(meta$time_point,meta$source))[rowSums(table(meta$time_point,meta$source)!=0)==2] 


prop_by_time = meta %>%
  filter(time_point %in% common_times) %>%
  group_by(time_point, celltype) %>%
  summarise(n = n(), .groups = "drop_last") %>%
  mutate(prop = n / sum(n)) %>%         # within each time_point
  ungroup()

fig1= prop_by_time%>%ggplot(aes(x = time_point, y = prop, fill = celltype)) +
  geom_col() +
  scale_y_continuous(labels = scales::percent_format(1)) +
  scale_fill_manual(
    values = celltype_palette,   
    drop   = TRUE                
  )+
  labs(
    title = "cell-type proportions by time point",
    x     = "",
    y     = "",
    fill  = ""
  ) +
  guides(fill = guide_legend(nrow = 3, byrow = TRUE)) +
  theme_classic() +
  theme(
    legend.position   = "top",           # ← move legend to top
    legend.direction  = "horizontal",    # ← ensure it’s laid out horizontally
    axis.text.x       = element_text(angle = 45, hjust = 1)
  )
fig1

ggsave(
  filename = file.path(out_dir,"celltype_prop_time.png"),
  plot     = fig1,
  width    = 8,
  height   = 4,
  units    = "in",
  dpi      = 300
)

cell-type proportions by genotype

prop_by_genotype = meta %>%
  group_by(genotype, celltype) %>%
  summarise(n = n(), .groups = "drop_last") %>%
  mutate(prop = n / sum(n)) %>%         # within each genotype
  ungroup()

fig2 = prop_by_genotype %>%
  ggplot(aes(x = genotype, y = prop, fill = celltype)) +
  geom_col() +
  scale_y_continuous(labels = scales::percent_format(1)) +
  scale_fill_manual(
    values = celltype_palette,   
    drop   = TRUE                
  )+
  labs(
    title = "cell-type proportions by genotype",
    x     = "",
    y     = "",
    fill  = ""
  ) +
  guides(fill = guide_legend(nrow = 3, byrow = TRUE)) +
  theme_classic() +
  theme(
    legend.position   = "top",           # ← move legend to top
    legend.direction  = "horizontal",    # ← ensure it’s laid out horizontally
    axis.text.x       = element_text(angle = 45, hjust = 1)
  )

ggsave(
  filename = file.path(out_dir,"celltype_prop_genotype.png"),
  plot     = fig2,
  width    = 8,
  height   = 4,
  units    = "in",
  dpi      = 300
)
fig2

split seurat object by time points

obj_list = SplitObject(obj, split.by = "time_point")
obj_list
## $D18
## An object of class Seurat 
## 36390 features across 36357 samples within 1 assay 
## Active assay: RNA (36390 features, 2000 variable features)
##  3 layers present: data, counts, scale.data
##  4 dimensional reductions calculated: pca, umap, integrated.rpca, rpca.umap
## 
## $D7
## An object of class Seurat 
## 36390 features across 16943 samples within 1 assay 
## Active assay: RNA (36390 features, 2000 variable features)
##  3 layers present: data, counts, scale.data
##  4 dimensional reductions calculated: pca, umap, integrated.rpca, rpca.umap
## 
## $D3
## An object of class Seurat 
## 36390 features across 18602 samples within 1 assay 
## Active assay: RNA (36390 features, 2000 variable features)
##  3 layers present: data, counts, scale.data
##  4 dimensional reductions calculated: pca, umap, integrated.rpca, rpca.umap
## 
## $D11
## An object of class Seurat 
## 36390 features across 12052 samples within 1 assay 
## Active assay: RNA (36390 features, 2000 variable features)
##  3 layers present: data, counts, scale.data
##  4 dimensional reductions calculated: pca, umap, integrated.rpca, rpca.umap
## 
## $`D-1`
## An object of class Seurat 
## 36390 features across 15775 samples within 1 assay 
## Active assay: RNA (36390 features, 2000 variable features)
##  3 layers present: data, counts, scale.data
##  4 dimensional reductions calculated: pca, umap, integrated.rpca, rpca.umap

cell-type proportions by genotype (conditioned on time point)

prop_lst = lapply(c("D3","D7","D11","D18"),function(tp){
  df = obj_list[[tp]][[]]
  df %>% group_by(genotype, celltype) %>%
  summarise(count = n()) %>%
  mutate(proportion = count / sum(count)) %>%
  arrange(genotype, desc(proportion))
})
names(prop_lst) = c("D3","D7","D11","D18")
fig_list = lapply( c("D3","D7","D11","D18"),function(tp){
  df = prop_lst[[tp]] 
  df %>%
    ggplot(aes(x = genotype, y = proportion, fill = celltype)) +
    geom_col() +
    scale_y_continuous(labels = scales::percent_format(1)) +
    scale_fill_manual(
      values = celltype_palette,   
      drop   = TRUE      
    )+
    labs(
      title = tp,
      x     = "",
      y     = "cell-type proportions",
      fill  = ""
    ) +
    guides(fill = guide_legend(nrow = 3, byrow = TRUE)) +
    theme_classic() +
    theme(
      legend.position   = "top",           # ← move legend to top
      legend.direction  = "horizontal",    # ← ensure it’s laid out horizontally
      axis.text.x       = element_text(angle = 45, hjust = 1)
    )
})
names(fig_list) = c("D3","D7","D11","D18")

for(tp in names(fig_list)){
  ggsave(file.path(out_dir,paste0("genotype_celltype_proportion/",tp,".png",sep='')),
         plot     = fig_list[[tp]],
  width    = 8,
  height   = 6,
  units    = "in",
  dpi      = 300
  )
}

fig_list
## $D3

## 
## $D7

## 
## $D11

## 
## $D18

genotype-wise similarity using cell type compositions (conditioned on time point)

using centering log ratio transformation

fig_list = list()
for(tp in names(prop_lst)){
  
  df = prop_lst[[tp]]
  df$celltype = as.factor(df$celltype)
  levels(df$celltype)=gsub("_","-",levels(df$celltype)) # rename celltype levels
  
  df = df%>%
    group_by(genotype)%>%
    mutate(geom_mean = exp(mean(log(proportion))))%>%
    mutate(clr = log(proportion/geom_mean))
  
  df$celltype = droplevels(df$celltype)
  print(unique(df$celltype))
  
  clr = df %>%
    select(genotype, celltype, clr) %>%
    tidyr::spread(key = celltype, value = clr) %>%
    tibble::column_to_rownames(var = "genotype") %>%
    as.matrix()
  
  clr[is.na(clr)] = 2*min(clr,na.rm=T) # deal with NA values
  
  clr_dist = as.matrix(dist(clr, method = "euclidean"))
  hc = hclust(as.dist(clr_dist))
  d = mean(  clr_dist)
  fig = pheatmap::pheatmap(exp(-clr_dist^2/(2*d)),
                           clustering_method = 'ward.D',
                           cluster_rows = hc,cluster_cols =hc,main = tp,fontsize = 5)
  fig_list[[tp]] = fig[[4]]
}
##  [1] DE                  ESC-DE              ESC                
##  [4] Mesenchymal-Stromal PFG                 EnP                
##  [7] Stromal             SC-alpha            SC-beta            
## [10] SC-delta            Ductal             
## 11 Levels: ESC < ESC-DE < DE < PFG < EnP < SC-alpha < SC-beta < ... < Stromal

##  [1] Liver               PFG                 Stromal            
##  [4] DE                  Endothelial         ESC                
##  [7] Ductal              ESC-DE              Mesenchymal-Stromal
## [10] PP                  PGT                 EnP                
## [13] SC-EC               SC-beta            
## 14 Levels: ESC < ESC-DE < DE < PFG < PGT < PP < EnP < SC-EC < ... < Liver

##  [1] PP                  EnP                 Stromal            
##  [4] PFG                 Endothelial         Liver              
##  [7] DE                  SC-delta            Mesenchymal-Stromal
## [10] Ductal              ESC                 ESC-DE             
## [13] SC-alpha            SC-EC               SC-beta            
## 15 Levels: ESC < ESC-DE < DE < PFG < PP < EnP < SC-EC < ... < Liver

##  [1] Ductal              EnP                 SC-beta            
##  [4] SC-EC               Stromal             SC-delta           
##  [7] Liver               PP                  SC-alpha           
## [10] DE                  ESC                 PFG                
## [13] Mesenchymal-Stromal Endothelial         PGT                
## 15 Levels: ESC < DE < PFG < PGT < PP < EnP < SC-EC < SC-alpha < ... < Liver

g = gridExtra::grid.arrange(gridExtra::arrangeGrob(grobs= fig_list,ncol=2))

ggsave(file.path(out_dir,"geno_similarity_cell-type-composition.png"),g)
g
## TableGrob (1 x 1) "arrange": 1 grobs
##   z     cells    name            grob
## 1 1 (1-1,1-1) arrange gtable[arrange]

create pseudobulk data

f_pseudobulk_list = file.path(out_dir,"ct_composition_pseudobulk_list.RDS")

if(file.exists(f_pseudobulk_list)){
  pseudo_obj_list = readRDS(f_pseudobulk_list)
}else{
  pseudo_obj_list = list()
  for(tp in c("D-1","D3","D7","D11","D18")){
    print(paste0("working on ", tp))
    obj = obj_list[[tp]]
    obj$background = as.factor(obj$background)
    obj$genotype = as.factor(obj$genotype)
    levels(obj$celltype)=gsub("_","-",levels(obj$celltype)) # rename celltype levels
    obj$celltype = droplevels(obj$celltype)
    obj$pseudo.sample.id = as.factor(paste(obj$sample_name,obj$background,obj$genotype,obj$feature.4,sep='_')) # handle cell type names
    pseudo_obj= AggregateExpression(obj, assays = "RNA", return.seurat = T,
                                        group.by = c("sample_name","background","genotype", "feature.4"))
    obj$pseudo.sample.id =  gsub("-","_",obj$pseudo.sample.id)
    pseudo_obj$pseudo.sample.id = gsub("-","_", colnames(pseudo_obj))
    pseudo_meta = pseudo_obj[[]]
    df = obj[[]] %>% 
      group_by(pseudo.sample.id, celltype) %>%
    summarise(count = n(),.groups = "drop") %>%
    tidyr::complete(pseudo.sample.id, celltype, fill = list(count = 0))%>%
       group_by(pseudo.sample.id) %>%
    mutate(proportion = count / sum(count)) 
    
    df_wide = df%>%tidyr::pivot_wider(names_from = celltype, values_from = c(count,proportion))
    pseudo_meta  = pseudo_meta %>%left_join(df_wide)
    rownames(pseudo_meta) = pseudo_meta$orig.ident
    
    n.df=obj[[]]%>%dplyr::count(pseudo.sample.id)
    pseudo_meta = pseudo_meta %>%left_join(n.df)
    rownames(pseudo_meta) = pseudo_meta$orig.ident
    
    pseudo_obj@meta.data =  pseudo_meta 
    
    pseudo_obj$source =as.factor(ifelse(pseudo_obj$sample_name%in%c("A","B","C","D","E","F"),"external_WT", "KO_village" ))
    
    pseudo_obj_list[[tp]] = pseudo_obj
  }
  saveRDS(pseudo_obj_list,f_pseudobulk_list)
}
names(celltype_palette) = gsub("_","-",names(celltype_palette))

check sample compositions for each time point

for(tp in c("D-1","D3","D7","D11","D18")){
  print(tp)
  df = pseudo_obj_list[[tp]][[]]
  print(summary(as.factor(df$sample_name)))
}
## [1] "D-1"
##  A  B  C  D  E  F  G 
##  2  2  2  2  2  2 81 
## [1] "D3"
##  A  B  C  D  E  F  H 
##  2  2  2  2  2  2 81 
## [1] "D7"
##  A  B  C  D  E  F  I 
##  2  2  2  2  2  2 81 
## [1] "D11"
##  A  B  C  D  E  F  J 
##  2  2  2  2  2  2 80 
## [1] "D18"
##  A  B  C  D  E  F  L 
##  1  1  1  1  1  1 80

remove pseudobulk samples with < 30 cells

min_cells = 30
pseudo_obj_list = lapply(pseudo_obj_list,function(pseudo_obj){
  pseudo_obj%>%subset(subset = n>=min_cells)
})
pseudo_obj_list
## $`D-1`
## An object of class Seurat 
## 36390 features across 93 samples within 1 assay 
## Active assay: RNA (36390 features, 0 variable features)
##  3 layers present: counts, data, scale.data
## 
## $D3
## An object of class Seurat 
## 36390 features across 86 samples within 1 assay 
## Active assay: RNA (36390 features, 0 variable features)
##  3 layers present: counts, data, scale.data
## 
## $D7
## An object of class Seurat 
## 36390 features across 84 samples within 1 assay 
## Active assay: RNA (36390 features, 0 variable features)
##  3 layers present: counts, data, scale.data
## 
## $D11
## An object of class Seurat 
## 36390 features across 72 samples within 1 assay 
## Active assay: RNA (36390 features, 0 variable features)
##  3 layers present: counts, data, scale.data
## 
## $D18
## An object of class Seurat 
## 36390 features across 73 samples within 1 assay 
## Active assay: RNA (36390 features, 0 variable features)
##  3 layers present: counts, data, scale.data

barplot for pseudobulk cell type proportions

barplot_list = list()
for(tp in c("D3","D7","D11","D18")){
  pseudo_obj = pseudo_obj_list[[tp]]
  df = pseudo_obj[[]]
  df_long= df %>%
  tidyr::pivot_longer(cols = starts_with("proportion_"), 
               names_to = "celltype", 
               values_to = "Proportion")
  df_long$celltype = gsub("proportion_","",df_long$celltype) 
  df_long = df_long %>%
  mutate(
    celltype = factor(celltype, levels = gsub("_","-",celltype_orders),ordered = TRUE)
  )
  
  fig = ggplot(df_long, aes(x = pseudo.sample.id, y = Proportion, fill = celltype)) +
    geom_col(position = "stack") +
    scale_fill_manual(
      values =celltype_palette,   
      drop   = TRUE                
    )+
    facet_wrap(~ genotype, 
               nrow = 2, 
               scales = "free_x", 
               strip.position = "top") +
    ylab("cell type proportion") +
    xlab("pseudobulk samples") +
    labs(
      title = tp,
      x     = "pseudobulk samples",
      y     = "cell type proportion",
      fill  = ""
    ) +
    guides(fill = guide_legend(nrow = 2, byrow = TRUE)) +
    theme_pubr() +
    theme(
      strip.text = element_text(angle = 90, size = 8, hjust = 0, face = "bold"),
      strip.background = element_rect(colour = "black", fill = "grey95", linewidth = 0),
      axis.text.x = element_blank(),
      axis.line = element_blank(),
      axis.ticks = element_blank()
    )

  barplot_list[[tp]] = fig
}

for(tp in names(barplot_list)){
  ggsave(file.path(out_dir,paste0("pseudobulk_celltype_proportion/",tp,".png",sep='')),
         plot     = barplot_list[[tp]],
  width    = 12,
  height   = 10,
  units    = "in",
  dpi      = 300
  )
}


barplot_list
## $D3

## 
## $D7

## 
## $D11

## 
## $D18

check distribution of cell type log proportions

for( tp in c("D3","D7","D11","D18")){
  obj = pseudo_obj_list[[tp]]
  df =  obj[[]]
  ct_response =  gsub("count_","",grep("^count_", names(df), value = TRUE))
  n = nrow(df)
  # create log proportions
  for(ct in ct_response){
    df[,paste("proportion0",ct,sep="_")] = df[,paste("proportion",ct,sep="_")]*(n-1)/n+0.5/n # transformation used by DR_data
    df[[paste("log_prop", ct,sep="_")]] = log(df[,paste("proportion0",ct,sep="_")] )
    print(paste(ct, " # Inf: ", sum(is.infinite(df[[paste("log_prop", ct,sep="_")]] )),sep=''))
  }
  
  for(c in grep("^log_prop_", colnames(df), value = TRUE)){
    hist(as.vector(df[c])[[1]],main = paste(tp, c,sep=" : "),xlab = "log proportion")
  }
}
## [1] "ESC # Inf: 0"
## [1] "ESC-DE # Inf: 0"
## [1] "DE # Inf: 0"
## [1] "PFG # Inf: 0"
## [1] "EnP # Inf: 0"
## [1] "SC-alpha # Inf: 0"
## [1] "SC-beta # Inf: 0"
## [1] "SC-delta # Inf: 0"
## [1] "Ductal # Inf: 0"
## [1] "Mesenchymal-Stromal # Inf: 0"
## [1] "Stromal # Inf: 0"

## [1] "ESC # Inf: 0"
## [1] "ESC-DE # Inf: 0"
## [1] "DE # Inf: 0"
## [1] "PFG # Inf: 0"
## [1] "PGT # Inf: 0"
## [1] "PP # Inf: 0"
## [1] "EnP # Inf: 0"
## [1] "SC-EC # Inf: 0"
## [1] "SC-beta # Inf: 0"
## [1] "Ductal # Inf: 0"
## [1] "Mesenchymal-Stromal # Inf: 0"
## [1] "Stromal # Inf: 0"
## [1] "Endothelial # Inf: 0"
## [1] "Liver # Inf: 0"

## [1] "ESC # Inf: 0"
## [1] "ESC-DE # Inf: 0"
## [1] "DE # Inf: 0"
## [1] "PFG # Inf: 0"
## [1] "PP # Inf: 0"
## [1] "EnP # Inf: 0"
## [1] "SC-EC # Inf: 0"
## [1] "SC-alpha # Inf: 0"
## [1] "SC-beta # Inf: 0"
## [1] "SC-delta # Inf: 0"
## [1] "Ductal # Inf: 0"
## [1] "Mesenchymal-Stromal # Inf: 0"
## [1] "Stromal # Inf: 0"
## [1] "Endothelial # Inf: 0"
## [1] "Liver # Inf: 0"

## [1] "ESC # Inf: 0"
## [1] "DE # Inf: 0"
## [1] "PFG # Inf: 0"
## [1] "PGT # Inf: 0"
## [1] "PP # Inf: 0"
## [1] "EnP # Inf: 0"
## [1] "SC-EC # Inf: 0"
## [1] "SC-alpha # Inf: 0"
## [1] "SC-beta # Inf: 0"
## [1] "SC-delta # Inf: 0"
## [1] "Ductal # Inf: 0"
## [1] "Mesenchymal-Stromal # Inf: 0"
## [1] "Stromal # Inf: 0"
## [1] "Endothelial # Inf: 0"
## [1] "Liver # Inf: 0"

run linear model and plot 95% confidence intervals of significant predictors

fig_list =list()
coef_list = list()
p_list = list()
for( tp in c("D3","D7","D11","D18")){
  df =  pseudo_obj_list[[tp]][[]]
  ct_response = gsub("count_","",grep("^count_", names(df), value = TRUE))
  n = nrow(df)
  # create log proportions
  for(ct in ct_response){
    df[,paste("proportion0",ct,sep="_")] = df[,paste("proportion",ct,sep="_")]*(n-1)/n+0.5/n # transformation used by DR_data
    df[[paste("log_prop", ct,sep="_")]] = log(df[,paste("proportion0",ct,sep="_")] )
  }
  
  mod_tp = list()
  coef_mtx = list()
  p_mtx = list()
  for(ct in ct_response){
    # run regression
    if(max(df[,paste("proportion",ct,sep="_")])>0.03){
      df$genotype = as.factor(df$genotype)
      df$genotype = relevel(df$genotype, ref = "WT")
      v = as.symbol(paste("log_prop", ct,sep="_"))
      mod = eval(bquote(lm(.(v)~source+background+genotype,data=df)))
      mod_tp[[ct]] = mod
      coef_mtx[[ct]] = summary(mod)$coefficients[grepl("genotype", names(summary(mod)$coefficients[,4])),1]
      p_mtx[[ct]] = summary(mod)$coefficients[grepl("genotype", names(summary(mod)$coefficients[,4])),4]
    }
  }
  coef_mtx = do.call(cbind,coef_mtx);rownames(coef_mtx) = gsub("genotype","",rownames(coef_mtx) )
  p_mtx = do.call(cbind,p_mtx);rownames(p_mtx) = gsub("genotype","",rownames(p_mtx) )
  coef_list[[tp]] = coef_mtx
  p_list[[tp]] = p_mtx
  
  # visualize results
  for(ct in names(mod_tp)){
    if(any(p_mtx[,ct]<=0.05)){
          mod = mod_tp[[ct]]
      fig = lm_plot(mod)+
                   ggtitle(paste(tp, ":", ct,sep=''))+
        theme(plot.title = element_text(size = 10, face = "bold"))
      fig_list[[paste(tp,ct,sep="_")]] = fig
    }
  }
}

only cell types with at least one genotype as a significant predictors are shown

coef_fig = gridExtra::grid.arrange(grobs = fig_list,ncol = 6,top = "Significant Coefficients on cell-type log(proportion)")

print(coef_fig)
## TableGrob (7 x 6) "arrange": 33 grobs
##                          z     cells    name                  grob
## D3_ESC                   1 (2-2,1-1) arrange        gtable[layout]
## D3_ESC-DE                2 (2-2,2-2) arrange        gtable[layout]
## D3_DE                    3 (2-2,3-3) arrange        gtable[layout]
## D7_ESC                   4 (2-2,4-4) arrange        gtable[layout]
## D7_ESC-DE                5 (2-2,5-5) arrange        gtable[layout]
## D7_DE                    6 (2-2,6-6) arrange        gtable[layout]
## D7_PFG                   7 (3-3,1-1) arrange        gtable[layout]
## D7_Ductal                8 (3-3,2-2) arrange        gtable[layout]
## D7_Stromal               9 (3-3,3-3) arrange        gtable[layout]
## D7_Endothelial          10 (3-3,4-4) arrange        gtable[layout]
## D7_Liver                11 (3-3,5-5) arrange        gtable[layout]
## D11_ESC                 12 (3-3,6-6) arrange        gtable[layout]
## D11_ESC-DE              13 (4-4,1-1) arrange        gtable[layout]
## D11_PFG                 14 (4-4,2-2) arrange        gtable[layout]
## D11_PP                  15 (4-4,3-3) arrange        gtable[layout]
## D11_EnP                 16 (4-4,4-4) arrange        gtable[layout]
## D11_SC-alpha            17 (4-4,5-5) arrange        gtable[layout]
## D11_Ductal              18 (4-4,6-6) arrange        gtable[layout]
## D11_Mesenchymal-Stromal 19 (5-5,1-1) arrange        gtable[layout]
## D11_Stromal             20 (5-5,2-2) arrange        gtable[layout]
## D11_Endothelial         21 (5-5,3-3) arrange        gtable[layout]
## D11_Liver               22 (5-5,4-4) arrange        gtable[layout]
## D18_ESC                 23 (5-5,5-5) arrange        gtable[layout]
## D18_PP                  24 (5-5,6-6) arrange        gtable[layout]
## D18_EnP                 25 (6-6,1-1) arrange        gtable[layout]
## D18_SC-EC               26 (6-6,2-2) arrange        gtable[layout]
## D18_SC-alpha            27 (6-6,3-3) arrange        gtable[layout]
## D18_SC-beta             28 (6-6,4-4) arrange        gtable[layout]
## D18_SC-delta            29 (6-6,5-5) arrange        gtable[layout]
## D18_Ductal              30 (6-6,6-6) arrange        gtable[layout]
## D18_Stromal             31 (7-7,1-1) arrange        gtable[layout]
## D18_Liver               32 (7-7,2-2) arrange        gtable[layout]
##                         33 (1-1,1-6) arrange text[GRID.text.12718]
ggsave(file.path(out_dir,"log_proportion_lm_coef.png"),coef_fig,width = 15,height=15)

heatmaps of estimated coefficients for genotypes

values are truncated at \(\pm 3\) for visualization purpose

meta%>%filter(time_point=='D3')%>%group_by(celltype)%>%summarise(n=n())
## # A tibble: 11 × 2
##    celltype                n
##    <ord>               <int>
##  1 ESC                  1152
##  2 ESC_DE               2427
##  3 DE                  14964
##  4 PFG                     5
##  5 EnP                     5
##  6 SC-alpha                2
##  7 SC-beta                 2
##  8 SC-delta                1
##  9 Ductal                  1
## 10 Mesenchymal_Stromal    38
## 11 Stromal                 5
htmap_list = list()
for(tp in names(coef_list)){
  coef_mtx =coef_list[[tp]]
  coef_mtx[coef_mtx>3] = 3;coef_mtx[coef_mtx=3] = -3
  htmap = pheatmap::pheatmap(coef_mtx,cluster_cols = F,fontsize = 5,
                      color = colorRampPalette(c("blue", "white", "red"))(50),
                      breaks = seq(min(coef_mtx,na.rm=T), max(coef_mtx,na.rm=T), length.out = 51),
                     main = tp)
  htmap_list[[tp]] = htmap
}

names(htmap_list) = names(coef_list)

for(tp in names(htmap_list)){
  png(file.path(out_dir,paste0("coefs_htmaps/",tp,".png",sep='')),width = 8,height = 6,unit = "in",res = 1000)
    print(htmap_list[[tp]])
  dev.off()
  print(htmap)
}

dotplots of genotype effects on cell type proportions

dotplot_df = imap_dfr(p_list, function(pmat, tp) {
  cmat = coef_list[[tp]]
  as_tibble(pmat, rownames="genotype") %>%
    pivot_longer(-genotype, names_to="cell_type", values_to="pval") %>%
    left_join(
      as_tibble(cmat, rownames="genotype") %>%
        pivot_longer(-genotype, names_to="cell_type", values_to="coef"),
      by = c("genotype","cell_type")
    ) %>%
    mutate(time_point = tp,
           neglog10_p  = -log10(pval))
})

dotplot_df = dotplot_df %>%
  mutate(time_point = factor(time_point,
                             levels = c("D3","D7","D11","D18"),
                             ordered = TRUE))
dotplot_list = map(c("D3","D7","D11","D18"), function(tp) {

  
  coef_mtx = coef_list[[tp]]

  genotype_order = rownames(coef_mtx)[ htmap_list[[tp]]$tree_row$order ]
  
  df_tp = filter(dotplot_df, time_point == tp) |>
           mutate(
             genotype  = factor(genotype,
                                levels = genotype_order,
                                ordered = TRUE),
             cell_type = factor(cell_type,
                                levels = celltype_orders,
                                ordered = TRUE)
           ) |>
           drop_na(genotype, cell_type)      # drops cell types not present
  ggplot(df_tp,
         aes(cell_type, genotype,
             size  = neglog10_p,
             color = coef)) +
    geom_point() +
    scale_size(range = c(0.5, 6),
               name  = expression(-log[10]~italic(p))) +
    scale_color_gradient2(low      = "#2166AC",
                          mid      = "white",
                          high     = "#B2182B",
                          midpoint = 0,
                          name     = "Coef") +
    labs(title = tp, x = NULL, y = NULL) +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
          plot.title  = element_text(hjust = 0.5))
})
names(dotplot_list) = c("D3","D7","D11","D18")
dotplot_list
## $D3

## 
## $D7

## 
## $D11

## 
## $D18

for(tp in names(dotplot_list)){
  
ggsave(file.path(out_dir, paste0("coefs_dotplots/", tp, ".png",sep='')),
               plot     = dotplot_list[[tp]],
              width = 6,height = 9,
              unit = "in",
               dpi      = 300)
}

pdf(file.path(out_dir,"coefs_dotplots/coefs_dotplots.pdf"), width = 6, height = 9)
for(tp in names(dotplot_list)){
  print(dotplot_list[[tp]])
}
dev.off()
## png 
##   2
# take the order from the heat-map dendrogram of *one* reference TP.
genotype_order = rownames(coef_list[["D18"]])[ htmap_list[["D18"]]$tree_row$order ]

tp_levels      = levels(dotplot_df$time_point)
tp_ct_levels = unlist(
  lapply(tp_levels, \(tp) paste(tp, celltype_orders, sep = "_"))
)

plot_df = dotplot_df %>% 
  mutate(genotype = factor(genotype,
                           levels  = genotype_order,
                           ordered = TRUE),
         tp_ct  = factor(paste(time_point, cell_type, sep = "_"),
                           levels  = tp_ct_levels,
                           ordered = TRUE)) %>% 
  drop_na(genotype, tp_ct)

dotplot = plot_df%>%
  ggplot(aes(x = tp_ct,         
           y = genotype,
           size  = neglog10_p,
           colour = coef)) +
  geom_point() +
  scale_size(range = c(0.3, 4),
             name  = expression(-log[10]~italic(p))) +
  scale_colour_gradient2(low = "#2166AC",
                         mid = "white",
                         high = "#B2182B",
                         midpoint = 0,
                         name = "Coef") +
  labs(x = NULL, y = NULL) +
  theme_bw(base_size = 8) + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
        panel.grid.major.x = element_blank(),      # optional: declutter grid
        plot.title = element_text(hjust = 0.5))
dotplot

ggsave(file.path(out_dir,"coefs_dotplots/all.png"),
               plot  = dotplot,
              width = 5.5,height = 6,
              unit = "in",
               dpi      = 300)

Session information

gc()
##              used    (Mb) gc trigger    (Mb)   max used    (Mb)
## Ncells    8535650   455.9   14173958   757.0   14173958   757.0
## Vcells 3271527810 24959.8 7505063610 57259.1 6139819429 46843.2
sessionInfo()
## R version 4.4.0 (2024-04-24)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 18.04.6 LTS
## 
## Matrix products: default
## BLAS/LAPACK: FlexiBLAS OPENBLAS;  LAPACK version 3.11.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: America/Los_Angeles
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] lubridate_1.9.3             forcats_1.0.0              
##  [3] purrr_1.0.2                 readr_2.1.5                
##  [5] tidyverse_2.0.0             ggpubr_0.6.0               
##  [7] ggrepel_0.9.5               gridExtra_2.3              
##  [9] DESeq2_1.44.0               SummarizedExperiment_1.34.0
## [11] Biobase_2.64.0              MatrixGenerics_1.16.0      
## [13] matrixStats_1.3.0           GenomicRanges_1.56.0       
## [15] GenomeInfoDb_1.40.0         IRanges_2.38.0             
## [17] S4Vectors_0.42.0            BiocGenerics_0.50.0        
## [19] readxl_1.4.3                reshape2_1.4.4             
## [21] pheatmap_1.0.12             stringr_1.5.1              
## [23] patchwork_1.2.0             cowplot_1.1.3              
## [25] tibble_3.2.1                ggplot2_3.5.1              
## [27] tidyr_1.3.1                 dplyr_1.1.4                
## [29] Matrix_1.7-0                Seurat_5.1.0               
## [31] SeuratObject_5.0.2          sp_2.1-4                   
## 
## loaded via a namespace (and not attached):
##   [1] RcppAnnoy_0.0.22        splines_4.4.0           later_1.3.2            
##   [4] cellranger_1.1.0        polyclip_1.10-6         fastDummies_1.7.3      
##   [7] lifecycle_1.0.4         rstatix_0.7.2           globals_0.16.3         
##  [10] lattice_0.22-6          MASS_7.3-60.2           backports_1.4.1        
##  [13] magrittr_2.0.3          plotly_4.10.4           sass_0.4.9             
##  [16] rmarkdown_2.26          jquerylib_0.1.4         yaml_2.3.8             
##  [19] httpuv_1.6.15           sctransform_0.4.1       spam_2.10-0            
##  [22] spatstat.sparse_3.0-3   reticulate_1.36.1       pbapply_1.7-2          
##  [25] RColorBrewer_1.1-3      abind_1.4-5             zlibbioc_1.50.0        
##  [28] Rtsne_0.17              GenomeInfoDbData_1.2.12 irlba_2.3.5.1          
##  [31] listenv_0.9.1           spatstat.utils_3.0-4    goftest_1.2-3          
##  [34] RSpectra_0.16-1         spatstat.random_3.2-3   fitdistrplus_1.2-1     
##  [37] parallelly_1.37.1       leiden_0.4.3.1          codetools_0.2-20       
##  [40] DelayedArray_0.30.1     tidyselect_1.2.1        UCSC.utils_1.0.0       
##  [43] farver_2.1.2            spatstat.explore_3.2-7  jsonlite_1.8.8         
##  [46] progressr_0.14.0        ggridges_0.5.6          survival_3.6-4         
##  [49] systemfonts_1.1.0       tools_4.4.0             ragg_1.3.1             
##  [52] ica_1.0-3               Rcpp_1.0.12             glue_1.7.0             
##  [55] SparseArray_1.4.3       xfun_0.44               withr_3.0.0            
##  [58] fastmap_1.2.0           fansi_1.0.6             digest_0.6.35          
##  [61] timechange_0.3.0        R6_2.5.1                mime_0.12              
##  [64] textshaping_0.3.7       colorspace_2.1-0        scattermore_1.2        
##  [67] tensor_1.5              spatstat.data_3.0-4     utf8_1.2.4             
##  [70] generics_0.1.3          data.table_1.15.4       httr_1.4.7             
##  [73] htmlwidgets_1.6.4       S4Arrays_1.4.0          uwot_0.2.2             
##  [76] pkgconfig_2.0.3         gtable_0.3.5            lmtest_0.9-40          
##  [79] XVector_0.44.0          htmltools_0.5.8.1       carData_3.0-5          
##  [82] dotCall64_1.1-1         scales_1.3.0            png_0.1-8              
##  [85] knitr_1.46              rstudioapi_0.16.0       tzdb_0.4.0             
##  [88] nlme_3.1-164            cachem_1.0.8            zoo_1.8-12             
##  [91] KernSmooth_2.23-22      parallel_4.4.0          miniUI_0.1.1.1         
##  [94] pillar_1.9.0            grid_4.4.0              vctrs_0.6.5            
##  [97] RANN_2.6.1              promises_1.3.0          car_3.1-2              
## [100] xtable_1.8-4            cluster_2.1.6           evaluate_0.23          
## [103] cli_3.6.2               locfit_1.5-9.9          compiler_4.4.0         
## [106] rlang_1.1.3             crayon_1.5.2            future.apply_1.11.2    
## [109] ggsignif_0.6.4          labeling_0.4.3          plyr_1.8.9             
## [112] stringi_1.8.4           viridisLite_0.4.2       deldir_2.0-4           
## [115] BiocParallel_1.38.0     munsell_0.5.1           lazyeval_0.2.2         
## [118] spatstat.geom_3.2-9     RcppHNSW_0.6.0          hms_1.1.3              
## [121] future_1.33.2           shiny_1.8.1.1           highr_0.10             
## [124] ROCR_1.0-11             igraph_2.0.3            broom_1.0.5            
## [127] bslib_0.7.0